import pandas as pd
seed=77
def load_dataset(path):
return pd.read_csv(path, index_col=0)
df = load_dataset("./sample/sample_12_May_2020.csv")
df.info()
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit
X_train, X_test = train_test_split(df, test_size=0.3, random_state=seed)
def extract_dv(df):
return df.status == 'SUCCESS'
display(extract_dv(X_train).head())
display(extract_dv(X_test).head())
df.head()
def print_uniqueValue(df):
df_unique = pd.DataFrame()
for col_name in df.columns:
df_unique[col_name] = [len(df[col_name].unique())]
df_unique['total'] = [len(df)]
df_unique.index = ['unique count']
display(df_unique.T)
print_uniqueValue(df)
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
pd.options.mode.chained_assignment = None  # the transformers below assign into slices of the incoming frame in place
class URLLengthCounter(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:,'url_length'] = result['url'].apply(self._get_length)
return result
def _get_length(self, url):
return len(url)
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
])
result = pipe.transform(X_train)
display(result[['url', 'url_length']].head(5))
class URLDepthCounter(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:,'url_depth'] = result['path'].apply(self._get_depth)
return result
    def _get_depth(self, path):
        # Count path segments; a trailing '/' adds no depth, so '/a/b' and '/a/b/' both return 2
        last_idx = path.rindex('/')
        if last_idx + 1 < len(path):
            last_idx = len(path)
        return path[:last_idx].count('/')
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
])
result = pipe.transform(X_train)
display(result[['path', 'url_depth']].head(5))
class HasWWWConverter(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:, 'has_www'] = result['netloc'].apply(self._has_www)
return result
def _has_www(self, domain):
return int(domain.startswith('www.'))
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
])
result = pipe.transform(X_train)
display(result[['netloc', 'has_www']].head(5))
class SubdomainLevelCounter(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:, 'subdomain_level'] = result['netloc'].apply(self._get_level)
return result
    def _get_level(self, domain):
        # Approximate the subdomain level by the number of dots in the host name
        return domain.count('.')
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
])
result = pipe.transform(X_train)
display(result[['netloc', 'subdomain_level']].head(5))
import numpy as np
class RequestParameterCounter(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
        result.loc[:, 'params'] = result['params'].fillna('')
result.loc[:, 'param_cnt'] = result['params'].apply(self._count_param)
return result
    def _count_param(self, params):
        # Parameters are '&'-separated, so n separators means n + 1 parameters
        if params == '':
            return 0
        return params.count('&') + 1
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
])
result = pipe.transform(X_train)
display(result[['params', 'param_cnt']].head(5))
!pip3 install feature_engine
from feature_engine import categorical_encoders
class DomainSuffixBuilder(BaseEstimator, TransformerMixin):
def __init__(self):
        self._suffix_dict = None
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
        # Drop malformed rows whose host name contains no dot
        result = result[result['netloc'].apply(lambda x: '.' in x)]
result.loc[:, 'suffix'] = result.netloc.apply(DomainSuffixBuilder._get_url_suffix)
result.loc[:, 'is_port_access'] = result.suffix.apply(DomainSuffixBuilder._is_port_access)
result.loc[:, 'suffix_idx'] = result.suffix.apply(DomainSuffixBuilder._clean_url_suffix)
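        # The encoder below is fitted inside transform, so the frequency mapping reflects
        # whatever frame is passed in; 'suffix' is replaced by its encoded frequency while
        # 'suffix_idx' keeps the cleaned string form.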
encoder = categorical_encoders.CountFrequencyCategoricalEncoder(
encoding_method='frequency',
variables=['suffix'])
result = encoder.fit_transform(result)
self._suffix_dict = encoder.encoder_dict_['suffix']
return result
@property
def suffix_dict(self):
return self._suffix_dict
@staticmethod
def _get_url_suffix(url):
last_idx = url.rindex('.')
return url[last_idx + 1:]
@staticmethod
def _clean_url_suffix(url):
return url.split(':')[0]
@staticmethod
def _is_port_access(suffix):
return int(len([token for token in suffix.split(':') if token.strip() != ''])>1)
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
])
result = pipe.transform(X_train)
display(result[['netloc', 'is_port_access', 'suffix', 'suffix_idx']].head(5))
pipe.steps[-1][1].suffix_dict
Ref: https://en.wikipedia.org/wiki/Domain_Name_System#cite_ref-rfc1034_1-2
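A quick sanity check of that length rule before wiring it into the cleaner (a minimal sketch; the sample suffixes are made up):
import re
# Letters only, 2 to 63 characters, mirroring the cleaner's regex below
tld_pattern = re.compile(r'^[a-zA-Z]{2,63}$')
for suffix in ['com', 'io', 'museum', 'c', '8080', 'a' * 64]:
    print(f"{suffix[:12]}: {bool(tld_pattern.match(suffix))}")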
import re
class IncorrectDomainUrlCleaner(BaseEstimator, TransformerMixin):
def __init__(self):
        # A TLD label is 2 to 63 characters long (RFC 1034)
        self._regex = re.compile(r'^[a-zA-Z]{2,63}$')
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:, 'is_correct'] = result.suffix_idx.apply(self._is_correct)
result = result[result.is_correct]
result = result.drop('is_correct', axis=1)
return result
    def _is_correct(self, domain_suffix):
        return bool(self._regex.match(domain_suffix))
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
])
result = pipe.transform(X_train)
print(f'Before changes: {len(X_train)}')
print(f'After changes: {len(result)}')
from feature_engine import categorical_encoders
class ColumnRenamer(BaseEstimator, TransformerMixin):
def __init__(self, mapping):
self._mapping = mapping
@property
def mapping(self):
return self._mapping
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
self._mapping = {key: value for key, value in self._mapping.items() if key in result.columns}
result = result.rename(columns=self._mapping)
return result
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])
result = pipe.transform(X_train)
display(result[['url', 'protocol_type']].head(5))
print_uniqueValue(result)
result.info()
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
])
result = pipe.transform(X_train)
non_binary_result = result[['protocol_type', 'url_length', 'url_depth', 'subdomain_level', 'param_cnt', 'suffix_idx']]
def plot_distribution(data, title):
fig = make_subplots(rows=len(data.columns), cols=1,
subplot_titles=data.columns)
for idx, col_name in enumerate(data.columns):
fig.add_trace(go.Histogram(x=data[col_name], name=col_name), row=idx + 1, col=1)
fig.update_layout(height=1200, width=800, title_text=title)
return fig
plot_distribution(non_binary_result, "Non Binary Features Distribution")
binary_result = result[['status', 'has_www', 'is_port_access']]
plot_distribution(binary_result, "Binary Features Distribution")
Most of the non-binary features are right-skewed, so a standard scaler will be applied later in the pipeline (a quick check follows below).
The timestamp will not be used in the logistic regression: it is directly correlated with the dependent variable.
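A quick check backs up the skew claim (a minimal sketch; pandas reports positive sample skewness for right-skewed distributions):
# Skewness of the numeric features; values > 0 indicate a longer right tail
display(non_binary_result.skew(numeric_only=True))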
import time
import datetime
class TimeseriesConverter(BaseEstimator, TransformerMixin):
def __init__(self):
self._scraped_dt = datetime.datetime.strptime('20200513132015', "%Y%m%d%H%M%S")
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
result.loc[:, 'timestamp'] = result.timestamp.astype(str)
result.loc[:, 'timestamp'] = result.timestamp.apply(lambda x: datetime.datetime.strptime(x, "%Y%m%d%H%M%S"))
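        # Assumes every timestamp predates the 2020-05-13 reference point (the sample file is
        # dated 12 May 2020), so .days >= 1 and the division below cannot hit zero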
result.loc[:, 'timestamp_coef'] = 1/(self._scraped_dt - result.timestamp).apply(lambda x: x.days)
return result
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
('timeseries_converter', TimeseriesConverter()),
])
result = pipe.transform(X_train)
result.timestamp_coef.head()
class FeatureRemover(BaseEstimator, TransformerMixin):
def __init__(self, features):
self._removed_features = None
self._features = features
@property
def removed_features(self):
return self._removed_features
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
self._removed_features = [col_name for col_name in self._features if col_name in result.columns]
result = result.drop(self._removed_features, axis=1)
return result
class FeaturePicker(BaseEstimator, TransformerMixin):
def __init__(self, features):
self._picked_features = None
self._features = features
@property
def picked_features(self):
return self._picked_features
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
self._picked_features = [col_name for col_name in self._features if col_name in result.columns]
result = result[self._picked_features]
return result
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ('timeseries_converter', TimeseriesConverter()),
('feature_picker', FeaturePicker(['protocol_type',
'url_depth',
'has_www',
'subdomain_level',
'param_cnt',
'suffix',
'timestamp_coef',
'is_port_access',
'status',
])),
])
result = pipe.transform(X_train)
result.columns
from itertools import compress
from sklearn import feature_selection
class LowVarianceRemover(BaseEstimator, TransformerMixin):
    def __init__(self, threshold):
        self._p = threshold
        # Binary features are Bernoulli-distributed, so their variance is p * (1 - p)
        self._bi_vt = feature_selection.VarianceThreshold(threshold=threshold * (1 - threshold))
        self._regular_vt = feature_selection.VarianceThreshold(threshold=threshold)
        self._dropped_columns = list()
    @property
    def threshold(self):
        return self._p
@property
def dropped_columns(self):
return self._dropped_columns
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
result = x
df_unique = pd.DataFrame()
for col_name in result.columns:
if 'status' != col_name:
df_unique[col_name] = [len(result[col_name].unique())]
df_unique.index = ['unique count']
df_unique = df_unique.T.squeeze()
bi_columns = df_unique[df_unique == 2].index.tolist()
regular_columns = df_unique[df_unique != 2].index.tolist()
if len(bi_columns) >0:
self._bi_vt.fit(result[bi_columns])
bi_mask = self._bi_vt.variances_ < self._p * (1 - self._p)
self._dropped_columns = self._dropped_columns + list(compress(bi_columns, bi_mask))
if len(regular_columns) >0 :
self._regular_vt.fit(result[regular_columns])
regular_mask = self._regular_vt.variances_ < self._p
self._dropped_columns = self._dropped_columns + list(compress(regular_columns, regular_mask))
if len(self._dropped_columns) > 0:
remover = FeatureRemover(self._dropped_columns)
result = remover.transform(result)
return result
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ('timeseries_converter', TimeseriesConverter()),
('feature_picker', FeaturePicker(['protocol_type',
'url_depth',
'has_www',
'subdomain_level',
'param_cnt',
'suffix',
'timestamp_coef',
'is_port_access',
'status',
])),
('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
encoding_method='frequency',
variables=['protocol_type'])),
('low_variance_remover', LowVarianceRemover(0.01))
])
result = pipe.fit_transform(X_train)
print(f'Before transform: {X_train.columns}\n')
print(f'After transform: {result.columns}\n')
print(f'Dropped columns: {pipe.steps[-1][1].dropped_columns}')
The variance filter wipes out the port indicator, but I believe it could help explain the availability of the URL resource, so I will build a separate subset later to analyze that part (a sketch follows below).
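For later reference, the subset could be carved out before the variance filter removes the flag (a sketch; pipe_no_vt stands for a hypothetical copy of the pipeline with the low_variance_remover step left out):
# Hypothetical: transform without the variance filter, then keep the port-access rows
result_with_port = pipe_no_vt.fit_transform(X_train)
port_subset = result_with_port[result_with_port['is_port_access'] == 1]
print(f'Rows accessing a non-default port: {len(port_subset)}')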
from sklearn import preprocessing
class CustomizedStandardizer(BaseEstimator, TransformerMixin):
def __init__(self, norm='l2'):
self._pipe = Pipeline([
('normalizer', preprocessing.Normalizer(norm=norm, copy=True)),
('standard_scaler', preprocessing.StandardScaler()),
])
self._columns = None
@property
def columns(self):
return self._columns
def fit(self,x,y=None):
return self
def transform(self,x,y=None):
self._columns = x.drop('status', axis=1).columns
self._columns = [*self._columns, 'status']
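        # The internal pipeline is fitted here in transform (fit() is a no-op), so the scaling
        # statistics come from whatever frame is passed in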
result = self._pipe.fit_transform(x.drop('status', axis=1))
dv = x.status.apply(lambda v: 1 if 'SUCCESS' == v else 0).tolist()
dv = np.array([dv]).T
result = np.append(result, dv, axis=1)
return result
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ('timeseries_converter', TimeseriesConverter()),
('feature_picker', FeaturePicker(['protocol_type',
'url_depth',
'has_www',
'subdomain_level',
'param_cnt',
'suffix',
'timestamp_coef',
'is_port_access',
'status'
])),
('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
encoding_method='frequency',
variables=['protocol_type'])),
('low_variance_remover', LowVarianceRemover(0.01)),
('standard_scaler', CustomizedStandardizer(norm='l2')),
])
result = pipe.fit_transform(X_train)
result = pd.DataFrame(result, columns= pipe.steps[-1][1].columns)
plot_distribution(result, "Standardized Features Distribution")
import gc
import multiprocessing
import warnings
warnings.filterwarnings("ignore")
cpu_cnt = multiprocessing.cpu_count()
allocated_cpu = cpu_cnt
print(f"Allocated {allocated_cpu} CPUs")
gc.collect()
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.naive_bayes import GaussianNB
class AnalysisEngineBuilder:
    def __init__(self):
self._X_train = None
self._y_train = None
self._X_test = None
self._y_test = None
self._param_grid = None
self._engine = None
def set_X_train(self, X_train):
self._X_train = X_train
return self
def set_y_train(self, y_train):
self._y_train = y_train
return self
def set_X_test(self, X_test):
self._X_test = X_test
return self
def set_y_test(self, y_test):
self._y_test = y_test
return self
def set_param_grid(self, param_grid):
self._param_grid = param_grid
return self
def set_engine(self, engine):
self._engine = engine
return self
def build(self):
return AnalysisEngineBuilder._AnalysisEngine(self._X_train, self._y_train, self._X_test, self._y_test, self._param_grid, self._engine)
class _AnalysisEngine:
def __init__(self, X_train, y_train, X_test, y_test, param_grid, engine):
self._X_train = X_train
self._y_train = y_train
self._X_test = X_test
self._y_test = y_test
self._param_grid = param_grid
self._engine = engine
self._grid = GridSearchCV(self._engine, self._param_grid, cv=10, scoring='accuracy')
self._pred = None
self._pred_prob = None
self._accuracy = None
self._roc = None
self._tpr = None
            self._fpr = None
            self._threshold = None
@property
def grid_search_result(self):
return pd.DataFrame(self._grid.cv_results_)
@property
def accuracy(self):
return self._accuracy
@property
def roc(self):
return self._roc
@property
def tpr(self):
return self._tpr
@property
def fpr(self):
return self._fpr
@property
def threshold(self):
return self._threshold
def analyze(self):
self._grid.fit(self._X_train, self._y_train)
self._pred = self._grid.predict(self._X_test)
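            # Fall back to hard predictions for the ROC curve; overwritten with probabilities
            # below when the estimator supports predict_proba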
self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, self._pred)
try:
self._pred_prob = self._grid.predict_proba(self._X_test)
self._fpr, self._tpr, self._threshold = roc_curve(self._y_test, pd.DataFrame(self._pred_prob)[1])
except AttributeError as ae:
pass
self._accuracy = accuracy_score(self._y_test, self._pred)
self._roc = roc_auc_score(self._y_test, self._pred)
return self._grid
        def show_performance(self):
            print(f"ROC AUC: {round(self._roc * 100, 2)}%")
            print()
            print(classification_report(self._y_test, self._pred, target_names=["Failure", "Success"]))
import matplotlib
import matplotlib.pyplot as plt
class Visualizer:
@staticmethod
def group_plot_roc_curve(title, data_group):
plt.clf()
plt.figure(figsize=(5, 5), dpi=80)
x = [0.0, 1.0]
plt.plot(x, x, linestyle='dashed', color='red', linewidth=2, label='Naive prediction (Random guess)')
for idx, group in enumerate(data_group):
fpr = group[0]
tpr = group[1]
label = group[2]
linestyle= 'solid'
if idx % 2 == 1:
linestyle= 'dashed'
plt.plot(fpr, tpr, linestyle=linestyle, linewidth=2, label=label)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel("FPR", fontsize=14)
plt.ylabel("TPR", fontsize=14)
plt.legend(fontsize=10, loc='best')
plt.title(title, fontsize=14)
plt.tight_layout()
        plt.show()
@staticmethod
def plot_performance(data,
legend_type_name,
x_axis_name,
upper_y_label,
lower_y_label,
title):
plt.clf()
f, ax = plt.subplots(2, 1, figsize=(15,8))
legends = data[legend_type_name].unique()
for idx, legend in enumerate(legends):
_data = data[data[legend_type_name]==legend]
ax[0].plot(_data[x_axis_name], _data[upper_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
ax[0].set_xlabel(x_axis_name, fontsize=15)
ax[0].set_ylabel(upper_y_label.upper(), fontsize=15)
ax[0].legend(fontsize=10, loc='upper right')
ax[1].plot(_data[x_axis_name], _data[lower_y_label], linewidth=2, label=f'{legend_type_name}: {legend}')
ax[1].set_xlabel(x_axis_name, fontsize=15)
ax[1].set_ylabel(lower_y_label.upper(), fontsize=15)
ax[1].legend(fontsize=10, loc='lower right')
ax[0].set_title(f"Performance Evaluation of {title}", fontsize=24)
plt.tight_layout()
        plt.show()
@staticmethod
def plot_feature_importance(reg_coef, col_names, title):
reg_coef = pd.Series(reg_coef, index=col_names)
reg_coef = reg_coef.sort_values()
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
reg_coef.plot(kind="barh",)
plt.title(title, fontsize=15)
@staticmethod
def plot_importance_trending(X_train, feature_importance_matrix, title):
feature_importance = feature_importance_matrix.groupby('C').agg(['mean'])[[*X_train.columns]]
feature_importance.columns = X_train.columns.tolist()
feature_importance['C'] = feature_importance.index
column_names = X_train.columns
lbds = feature_importance['C'].tolist()
coef_matrix = feature_importance[X_train.columns]
x_lab = 'Lambda'
y_lab = 'Weight'
plt.clf()
plt.figure(figsize=(15, 10))
for idx, col_name in enumerate(column_names):
plt.plot(lbds, coef_matrix.iloc[:,idx], 'o-', linewidth=2, label=col_name)
c = coef_matrix.iloc[0,idx]
plt.annotate(col_name, (lbds[3], coef_matrix.iloc[3,idx]))
        plt.title(title, fontsize=25)
plt.xlabel(x_lab)
plt.ylabel(y_lab)
plt.legend(loc='upper right')
plt.tight_layout()
        plt.show()
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import hinge_loss
def loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model_func, param):
def _analyze_param_combination():
engine = AnalysisEngineBuilder() \
.set_X_train(X_train) \
.set_y_train(y_train) \
.set_X_test(X_test) \
.set_y_test(y_test) \
.set_param_grid(param) \
.set_engine(model_func) \
.build()
model = engine.analyze()
# Performance scores
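        # hinge_loss expects margin/decision values; the positive-class probability is used as a proxy here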
loss = hinge_loss(y_test, pd.DataFrame(model.predict_proba(X_test))[1])
auc = roc_auc_score(y_test, model.predict(X_test))
coef = pd.Series(model.best_estimator_.coef_[0], index=X_test.columns).to_dict()
        # Copy rather than alias, so the caller's param dict is not mutated; unwrap the single-value lists
        _param = {key: value[0] for key, value in param.items()}
return {
'accuracy': engine.accuracy * 100,
'loss': loss,
'auc': auc,
**coef,
**_param
}
return _analyze_param_combination
# Refactor into the analyzer later on
def calculate_grid_performance(X_train, y_train, X_test, y_test, params, model):
# build combination list
combination_list = pd.DataFrame({'dummy': [1]})
for key, values in params.items():
combination_list = pd.merge(combination_list, pd.DataFrame({key: values, 'dummy': [1] * len(values)}))
combination_list.drop('dummy',axis=1, inplace=True)
# Train and extract scores
futures = list()
results = list()
# Execute models in threads
with ThreadPoolExecutor(max_workers=allocated_cpu) as executor:
for combination in combination_list.to_dict('records'):
combination = {key:[value] for key, value in combination.items()}
future_model = executor.submit(loss_accuracy_analyze_job_builder(X_train, y_train, X_test, y_test, model, combination))
futures.append(future_model)
return pd.DataFrame.from_dict([future.result() for future in futures])
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
('url_length_counter', URLLengthCounter()),
('url_depth_counter', URLDepthCounter()),
('has_www_converter', HasWWWConverter()),
('subdomain_level_counter', SubdomainLevelCounter()),
('request_parameter_counter', RequestParameterCounter()),
('domain_suffix_builder', DomainSuffixBuilder()),
('incorrect_domain_url_cleaner', IncorrectDomainUrlCleaner()),
('column_renamer', ColumnRenamer({'scheme': 'protocol_type'})),
# ('timeseries_converter', TimeseriesConverter()),
('feature_picker', FeaturePicker(['protocol_type',
'url_depth',
'has_www',
'subdomain_level',
'param_cnt',
'suffix',
'timestamp_coef',
'is_port_access',
'status'
])),
('frequency_indexer', categorical_encoders.CountFrequencyCategoricalEncoder(
encoding_method='frequency',
variables=['protocol_type'])),
('low_variance_remover', LowVarianceRemover(.005)), # Decreased to .005
('standard_scaler', CustomizedStandardizer(norm='l2')),
])
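# Note: the whole pipeline (scaler included) is fitted on the full dataset before the split,
# so the test fold shares scaling statistics with the training data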
X = pipe.fit_transform(df)
X_train, X_test = train_test_split(X, test_size=0.3, random_state=seed)
y_train = X_train[:,-1]
X_train = pd.DataFrame(X_train, columns= pipe.steps[-1][1].columns)
X_train = X_train.drop('status', axis=1)
print(X_train.columns)
y_test = X_test[:,-1]
X_test = pd.DataFrame(X_test, columns= pipe.steps[-1][1].columns)
X_test = X_test.drop('status', axis=1)
print(X_test.columns)
start_time = time.time()
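# First pass: sweep max_iter with C and l1_ratio essentially fixed; a broader C/l1_ratio grid follows after the ROC plot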
param_lr = {
'l1_ratio': [0, *np.logspace(-3, 0, 1)],
'C': np.logspace(-3, 0, 1),
'max_iter': np.arange(10,80,1),
}
lr = LogisticRegression(random_state=seed,
penalty='elasticnet',
solver='saga',
multi_class='ovr',
warm_start=False,
n_jobs=allocated_cpu,
)
# Start to train model
engine_lr = AnalysisEngineBuilder() \
.set_X_train(X_train) \
.set_y_train(y_train) \
.set_X_test(X_test) \
.set_y_test(y_test) \
.set_param_grid(param_lr) \
.set_engine(lr) \
.build()
model_lr = engine_lr.analyze()
engine_lr.show_performance()
t = str(datetime.timedelta(seconds=time.time() - start_time)).split(':')
print("--- %s minutes, %.2f seconds ---" % (t[1], float(t[2])))
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
Visualizer.group_plot_roc_curve('ROC Curve of Logistic Regression', [
(engine_lr.fpr, engine_lr.tpr, 'Logistic Regression')
])
param_lr = {
'l1_ratio': [0, *np.logspace(-3, 0, 5)],
'C': np.logspace(-3, 0, 5),
'max_iter': np.arange(10,80,40),
}
lr = LogisticRegression(random_state=seed,
penalty='elasticnet',
solver='saga',
multi_class='ovr',
warm_start=False,
n_jobs=allocated_cpu,
)
loss_accuracy_matrix = calculate_grid_performance(X_train, y_train, X_test, y_test, param_lr, lr)
Visualizer.plot_performance(data=loss_accuracy_matrix,
                            legend_type_name='l1_ratio',
                            x_axis_name='C',
                            upper_y_label='loss',
                            lower_y_label='accuracy',
                            title='Loss & Accuracy - Logistic Regression'
                            )
Visualizer.plot_importance_trending(X_train, loss_accuracy_matrix, 'Weight change on each feature')